DET File Structure

The provided training data include image files in raw 16-bit format and the detection lists.

Each detection list is associated with a set of images and contains information for the detected objects, in space-delimited format with 16 columns, each row representing a single detection in one of the 4 original FITS images.

The columns are:

  • Unique ID -- An identifier for what detected object a row belongs to
  • Detection Number -- sequential numbering of detection output of the currently used detection software
  • Frame Number -- which observation is this row relevant to (1, 2, 3 or 4)
  • Sexnum -- Source extractor number of the object
  • Time -- Julian date
  • RA -- right ascension of object in decimal hours
  • DEC -- declination in decimal degrees
  • X -- location in pixels of the object in the original FITS image
  • Y -- location in pixels of the object in the original FITS image
  • Magnitude -- brightness of the object in magnitudes
  • FWHM -- full width at half maximum of Gaussian fit in pixels
  • Elong -- ratio of long axis to short axis
  • Theta -- position angle of the long axis
  • RMSE -- error in fit to straight line
  • Deltamu -- from Source Extractor, peak value minus threshold over background
  • Rejected -- this value will be 1 if the operator rejected the detection, 0 otherwise. This column will only be available during the training phase. You need to predict this column

TRAINING

1. Training with original columns


In [11]:
import pandas as pd

def onetofour(x):
    """Spread the first four elements of a sequence into a 4-tuple.

    Each grouped column holds one value per frame (four frames per
    detection id), so this turns a per-group list into four scalars.
    Raises IndexError if the sequence has fewer than four elements.
    """
    first, second, third, fourth = x[0], x[1], x[2], x[3]
    return first, second, third, fourth

# Column names for a .det file (one row per detection, 4 rows per object id).
# NOTE(review): the file-format description above lists 16 columns including
# "Deltamu", but only 15 names are given here (and "FWHE" looks like a typo
# for "FWHM") — verify against an actual .det file.
names = ["id","det_num", "frame_num", "sex_num", "time", "RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]
# Hard-coded absolute Windows path to a single sample .det file.
df = pd.read_csv("E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_files/01_12DEC03_N01014.det", header=None, names=names, delim_whitespace=True)

# Collapse the 4 per-frame rows of each object into one row of lists.
dfg = df.groupby('id',as_index=False).agg(lambda x: x.tolist())

# Per-frame measurement columns to spread out into <name>0..<name>3.
new_names = ["RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]
new_df = pd.DataFrame()

new_df['id'] = dfg['id']

# Spread each 4-element list into four scalar columns, one per frame.
# Assumes every id has exactly 4 detections (onetofour raises otherwise).
for nn in new_names:
    new_df[nn + '0'],new_df[nn + '1'],new_df[nn + '2'],new_df[nn + '3'] = zip(*dfg[nn].map(onetofour))

# new_df['rejected'] = zip(*dfg['rejected'].map(onebyone))
# The rejected flag is per-object, so the four copies are redundant;
# keep only rejected0 as the label.
new_df.drop(['rejected1','rejected2','rejected3'], axis=1, inplace=True)

new_df


Out[11]:
id RA0 RA1 RA2 RA3 DEC0 DEC1 DEC2 DEC3 X0 ... Elong3 theta0 theta1 theta2 theta3 RMSE0 RMSE1 RMSE2 RMSE3 rejected0
0 9 1.10287 1.09605 1.08769 1.08077 1231.906 1216.889 1184.891 1206.527 1599.445 ... 40.8 1.15 1.15 1.15 1.15 0.75 0.63 0.85 0.55 1
1 14 2.64320 2.64439 2.64506 2.64546 1588.978 1584.985 1567.276 1600.578 3816.586 ... -51.1 1.03 1.03 1.03 1.03 0.94 0.73 0.71 1.17 0
2 15 0.32087 0.33389 0.34766 0.36181 1606.368 1601.588 1580.328 1613.180 475.475 ... 45.4 1.63 1.63 1.63 1.63 0.45 0.77 0.92 0.39 1
3 16 0.80880 0.80142 0.79528 0.78851 1682.412 1668.442 1640.725 1662.952 1177.924 ... 90.0 1.66 1.66 1.66 1.66 0.51 0.66 0.49 0.42 1
4 25 1.40177 1.40288 1.40415 1.40520 2359.459 2355.695 2338.046 2371.710 2033.481 ... -15.0 0.71 0.71 0.71 0.71 0.86 0.87 1.28 1.26 0
5 26 1.57154 1.56891 1.56676 1.56490 2441.502 2439.360 2422.811 2457.871 2278.006 ... -74.6 0.84 0.84 0.84 0.84 0.85 0.87 0.82 0.90 0
6 36 2.69883 2.70075 2.70079 2.70141 3290.256 3279.316 3252.771 3277.572 3903.198 ... 70.9 1.65 1.65 1.65 1.65 0.43 0.70 0.72 0.67 1
7 37 0.60061 0.60100 0.60131 0.60188 3610.243 3605.142 3583.382 3614.265 885.432 ... 0.0 0.77 0.77 0.77 0.77 0.43 0.47 0.89 0.15 1
8 50 2.79557 2.79155 2.78740 2.78329 205.287 237.773 257.854 326.350 4031.637 ... 0.0 0.76 0.76 0.76 0.76 1.08 0.00 1.12 1.15 1
9 58 2.29163 2.23216 2.16983 2.11012 3354.141 3337.433 3305.431 3324.947 3317.545 ... 8.3 0.61 0.61 0.61 0.61 1.00 0.00 1.18 1.15 1
10 62 1.07723 1.07181 1.06610 1.06070 3506.649 3505.657 3489.361 3525.458 1570.936 ... 88.9 0.51 0.51 0.51 0.51 1.00 0.00 1.19 1.73 1

11 rows × 38 columns


In [4]:
import glob
import os
import pandas as pd

from IPython.display import display, HTML

# Hard-coded absolute Windows paths: input .det files and the flattened
# per-object training files written out by this cell.
DIR = "E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_files/"
OUT_DIR = "E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_training_files/"

names = glob.glob(DIR + "*.det")

# NOTE(review): same 15-name header as the single-file cell above — the
# format description mentions 16 columns (incl. Deltamu); verify.
header = ["id", "det_num", "frame_num", "sex_num", "time", "RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]

new_names = ["RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]

# Accumulates one flattened DataFrame per input file; concatenated by the
# training cells below.
training_input = []

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

# TODO(review): this loop body duplicates the single-file flattening logic
# of the cell above and relies on onetofour() defined there — consider
# extracting a shared helper function.
for name in names:

    df = pd.read_csv(name, header=None, names=header, delim_whitespace=True)
    
    # Some .det files may be empty; skip them.
    if df.empty:
        continue
        
    # One row of per-frame lists per object id.
    dfg = df.groupby('id',as_index=False).agg(lambda x: x.tolist())
    
    new_df = pd.DataFrame()

    new_df['id'] = dfg['id']

    # Spread each 4-element list into <name>0..<name>3 scalar columns.
    for nn in new_names:
        new_df[nn + '0'],new_df[nn + '1'],new_df[nn + '2'],new_df[nn + '3'] = zip(*dfg[nn].map(onetofour))

    # new_df['rejected'] = zip(*dfg['rejected'].map(onebyone))
    # Keep only rejected0 — the flag is identical across the four frames.
    new_df.drop(['rejected1','rejected2','rejected3'], axis=1, inplace=True)
    
    
    training_input.append(new_df)
    
    # Strip the input directory prefix to reuse the original file name.
    pathOutput = OUT_DIR + name[len(DIR):]
    # Written space-delimited without a header row (the row index IS written).
    new_df.to_csv(pathOutput, header=None, sep=' ')

In [10]:
# Train a gradient-boosted classifier on the flattened per-object rows
# produced by the batch-processing cell (one row per id, 4 frames wide).
from sklearn.ensemble import GradientBoostingClassifier
try:
    # sklearn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in 0.20; keep as a fallback
    # for the old environment this notebook originally ran in.
    from sklearn.cross_validation import train_test_split

training_data = pd.concat(training_input)

# Target: the operator's rejected flag (per-object, kept as 'rejected0').
# .values replaces the deprecated/removed DataFrame.as_matrix().
Y = training_data['rejected0'].values

# Features: everything except the object id and the target itself.
# drop() without inplace leaves training_data intact for later cells.
X = training_data.drop(['id', 'rejected0'], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# fit estimator
est = GradientBoostingClassifier(n_estimators=200, max_depth=3)
est.fit(X_train, y_train)

# predict class labels
pred = est.predict(X_test)

# score on test data (accuracy)
acc = est.score(X_test, y_test)
print('ACC: %.4f' % acc)

# predict class probabilities for the first test sample
est.predict_proba(X_test)[0]


ACC: 0.9765
Out[10]:
array([ 0.00202141,  0.99797859])

In [9]:
# Baseline: Gaussian naive Bayes on the same features, for comparison
# with the gradient-boosting model above.
from sklearn.naive_bayes import GaussianNB
try:
    # sklearn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in 0.20; fallback for the
    # old environment this notebook originally ran in.
    from sklearn.cross_validation import train_test_split

training_data = pd.concat(training_input)

# Dump the combined training table for offline inspection.
training_data.to_csv("test.csv", header=None, sep=' ')

# Target and features, as in the gradient-boosting cell.
# .values replaces the deprecated/removed DataFrame.as_matrix().
Y = training_data['rejected0'].values
X = training_data.drop(['id', 'rejected0'], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Parenthesized so this runs on both Python 2 and 3: the original
# 'print (expr).sum()' form prints the bare array on Python 3 and then
# fails with AttributeError (None.sum()).
print((y_test != y_pred).sum())  # number of misclassified samples
print((y_test == y_pred).sum())  # number of correctly classified samples


270
3130